{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gym\n",
    "import random\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.\u001b[0m\n",
      "Observation space: Box(4,)\n",
      "Action space: Discrete(2)\n"
     ]
    }
   ],
   "source": [
    "env_name = \"CartPole-v0\"\n",
    "env = gym.make(env_name)\n",
    "print(\"Observation space:\", env.observation_space)\n",
    "print(\"Action space:\", env.action_space)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class HillClimbingAgent():\n",
    "    def __init__(self, env):\n",
    "        self.state_dim = env.observation_space.shape\n",
    "        self.action_size = env.action_space.n\n",
    "        self.build_model()\n",
    "        \n",
    "    def build_model(self):\n",
    "        self.weights = 1e-4*np.random.rand(*self.state_dim, self.action_size)\n",
    "        self.best_reward = -np.Inf\n",
    "        self.best_weights = np.copy(self.weights)\n",
    "        self.noise_scale = 1e-2\n",
    "        \n",
    "    def get_action(self, state):\n",
    "        p = np.dot(state, self.weights)\n",
    "        action = np.argmax(p)\n",
    "        return action\n",
    "    \n",
    "    def update_model(self, reward):\n",
    "        if reward >= self.best_reward:\n",
    "            self.best_reward = reward\n",
    "            self.best_weights = np.copy(self.weights)\n",
    "            self.noise_scale = max(self.noise_scale/2, 1e-3)\n",
    "        else:\n",
    "            self.noise_scale = min(self.noise_scale*2, 2)\n",
    "            \n",
    "        self.weights = self.best_weights + self.noise_scale * np.random.rand(*self.state_dim, self.action_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Episode: 0, total_reward: 50.00\n",
      "Episode: 1, total_reward: 9.00\n",
      "Episode: 2, total_reward: 74.00\n",
      "Episode: 3, total_reward: 79.00\n",
      "Episode: 4, total_reward: 156.00\n",
      "Episode: 5, total_reward: 44.00\n",
      "Episode: 6, total_reward: 68.00\n",
      "Episode: 7, total_reward: 61.00\n",
      "Episode: 8, total_reward: 12.00\n",
      "Episode: 9, total_reward: 41.00\n",
      "Episode: 10, total_reward: 10.00\n",
      "Episode: 11, total_reward: 45.00\n",
      "Episode: 12, total_reward: 57.00\n",
      "Episode: 13, total_reward: 200.00\n",
      "Episode: 14, total_reward: 112.00\n",
      "Episode: 15, total_reward: 73.00\n",
      "Episode: 16, total_reward: 132.00\n",
      "Episode: 17, total_reward: 29.00\n",
      "Episode: 18, total_reward: 10.00\n",
      "Episode: 19, total_reward: 52.00\n",
      "Episode: 20, total_reward: 75.00\n",
      "Episode: 21, total_reward: 9.00\n",
      "Episode: 22, total_reward: 10.00\n",
      "Episode: 23, total_reward: 48.00\n",
      "Episode: 24, total_reward: 200.00\n",
      "Episode: 25, total_reward: 200.00\n",
      "Episode: 26, total_reward: 200.00\n",
      "Episode: 27, total_reward: 200.00\n",
      "Episode: 28, total_reward: 200.00\n",
      "Episode: 29, total_reward: 200.00\n",
      "Episode: 30, total_reward: 200.00\n",
      "Episode: 31, total_reward: 200.00\n",
      "Episode: 32, total_reward: 200.00\n",
      "Episode: 33, total_reward: 200.00\n",
      "Episode: 34, total_reward: 200.00\n",
      "Episode: 35, total_reward: 200.00\n",
      "Episode: 36, total_reward: 200.00\n",
      "Episode: 37, total_reward: 200.00\n",
      "Episode: 38, total_reward: 200.00\n",
      "Episode: 39, total_reward: 200.00\n",
      "Episode: 40, total_reward: 200.00\n",
      "Episode: 41, total_reward: 200.00\n",
      "Episode: 42, total_reward: 200.00\n",
      "Episode: 43, total_reward: 200.00\n",
      "Episode: 44, total_reward: 200.00\n",
      "Episode: 45, total_reward: 200.00\n",
      "Episode: 46, total_reward: 200.00\n",
      "Episode: 47, total_reward: 200.00\n",
      "Episode: 48, total_reward: 200.00\n",
      "Episode: 49, total_reward: 200.00\n",
      "Episode: 50, total_reward: 200.00\n",
      "Episode: 51, total_reward: 200.00\n",
      "Episode: 52, total_reward: 200.00\n",
      "Episode: 53, total_reward: 200.00\n",
      "Episode: 54, total_reward: 200.00\n",
      "Episode: 55, total_reward: 200.00\n",
      "Episode: 56, total_reward: 200.00\n",
      "Episode: 57, total_reward: 200.00\n",
      "Episode: 58, total_reward: 200.00\n",
      "Episode: 59, total_reward: 200.00\n",
      "Episode: 60, total_reward: 200.00\n",
      "Episode: 61, total_reward: 200.00\n",
      "Episode: 62, total_reward: 200.00\n",
      "Episode: 63, total_reward: 200.00\n",
      "Episode: 64, total_reward: 200.00\n",
      "Episode: 65, total_reward: 200.00\n",
      "Episode: 66, total_reward: 200.00\n",
      "Episode: 67, total_reward: 200.00\n",
      "Episode: 68, total_reward: 200.00\n",
      "Episode: 69, total_reward: 200.00\n",
      "Episode: 70, total_reward: 200.00\n",
      "Episode: 71, total_reward: 200.00\n",
      "Episode: 72, total_reward: 200.00\n",
      "Episode: 73, total_reward: 200.00\n",
      "Episode: 74, total_reward: 200.00\n",
      "Episode: 75, total_reward: 200.00\n",
      "Episode: 76, total_reward: 200.00\n",
      "Episode: 77, total_reward: 200.00\n",
      "Episode: 78, total_reward: 200.00\n",
      "Episode: 79, total_reward: 200.00\n",
      "Episode: 80, total_reward: 200.00\n",
      "Episode: 81, total_reward: 200.00\n",
      "Episode: 82, total_reward: 200.00\n",
      "Episode: 83, total_reward: 200.00\n",
      "Episode: 84, total_reward: 200.00\n",
      "Episode: 85, total_reward: 200.00\n",
      "Episode: 86, total_reward: 200.00\n",
      "Episode: 87, total_reward: 200.00\n",
      "Episode: 88, total_reward: 200.00\n",
      "Episode: 89, total_reward: 200.00\n",
      "Episode: 90, total_reward: 200.00\n",
      "Episode: 91, total_reward: 200.00\n",
      "Episode: 92, total_reward: 200.00\n",
      "Episode: 93, total_reward: 200.00\n",
      "Episode: 94, total_reward: 200.00\n",
      "Episode: 95, total_reward: 200.00\n",
      "Episode: 96, total_reward: 200.00\n",
      "Episode: 97, total_reward: 200.00\n",
      "Episode: 98, total_reward: 200.00\n",
      "Episode: 99, total_reward: 200.00\n"
     ]
    }
   ],
   "source": [
    "agent = HillClimbingAgent(env)\n",
    "num_episodes = 100\n",
    "\n",
    "for ep in range(num_episodes):\n",
    "    state = env.reset()\n",
    "    total_reward = 0\n",
    "    done = False\n",
    "    while not done:\n",
    "        action = agent.get_action(state)\n",
    "        state, reward, done, info = env.step(action)\n",
    "        env.render()\n",
    "        total_reward += reward\n",
    "        \n",
    "    agent.update_model(total_reward)\n",
    "    print(\"Episode: {}, total_reward: {:.2f}\".format(ep, total_reward))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
